In [ ]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
In [ ]:
#dataload
BRCA1_raw = pd.read_csv("Resources/BRCA1_HUMAN_Findlay_2018.csv")

Domainsplit: BRCA1_raw -> RING & BRCT¶

In [ ]:
#code to split our dataframe into its domains

RING = BRCA1_raw[:575]
BRCT = BRCA1_raw[575:]

Secondary structure analysis¶

In [ ]:
# add column named: 'position'
df=BRCA1_raw.copy()

import re 

def get_number_from_text(text):
    return re.findall(r'\d{1,4}',text)[0]

df['position']=df['mutant'].apply(get_number_from_text)
In [ ]:
# add column containing the newly introduced Aminoacid
test1=df.mutant.str.extract(r'(\d)([ARNDCQEGHILKTMfPOUSTWYV])')
df["new_aa"]=test1[1]
In [ ]:
# dataset with only bin0s
df_filtered = df[df['DMS_score_bin'] == 0]
In [ ]:
#CHECKING FOR SECONDARY STRUCTURES -> based on Psipred

#Helices:
helices = ['8', '9', '10',"11","12","13","14","15","16","17","18","19","20","21","22","28","29","30","31","41","42","43","44","45","46","47","48","49","50","51","52","70","71","72","73","78","79","80","81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97","1659","1660","1661","1662","1663","1664","1665","1666","1667","1668","1669","1670","1671","1701","1702","1703","1704","1705","1706","1707","1708","1716","1717","1718","1719","1720","1721","1722","1723","1724","1725","1749","1750","1751","1752","1753","1754","1755","1777","1778","1779","1780","1781","1782","1783","1784","1785","1786","1820","1821","1822","1823","1824","1825","1826","1827","1835","1836","1837","1838","1839","1840","1841","1842","1843","1844"]
helices_df = df.loc[df['position'].astype(str).isin(helices)]

#Helices_bin0:
helices_df_bin0 = helices_df.loc[helices_df['DMS_score_bin'] == 0].copy()

#Strands:
strands = ["1650","1651","1652","1653","1654","1674","1675","1676","1677","1684","1685","1686","1687","1688","1689","1712","1713","1714","1715","1765","1766","1767","1768","1769","1807","1808","1809","1810"]
strands_df = df.loc[df['position'].astype(str).isin(strands)]

#Strands_bin0:
strands_df_bin0 = strands_df.loc[strands_df['DMS_score_bin'] == 0].copy()

#Coils (improved technique :D):
df['position'] = df['position'].astype(int)
coils = list(range(1, 8)) + list(range(22, 35)) + list(range(37, 45))+ list(range(55, 67)) + list(range(69, 77)) + list(range(98, 102)) + list(range(1631, 1651))+ list(range(1656, 1660)) + list(range(1673, 1676)) + list(range(1679, 1686)) + list(range(1691, 1702)) + list(range(1710, 1715))+ list(range(1727, 1736))+ list(range(1738, 1753))+ list(range(1757, 1767))+ list(range(1771, 1778))+ list(range(1788, 1791))+ list(range(1794, 1796))+ list(range(1799, 1806))+ list(range(1812, 1821))+ list(range(1829, 1833))+ list(range(1846, 1851))+ list(range(1854, 1865))
coils_df = df.loc[df['position'].isin(coils)]

#Coils_bin0:
coils_df_bin0 = coils_df.loc[coils_df['DMS_score_bin'] == 0].copy()

# Testing for overlaps:
set1 = set(helices)
set2 = set(strands)
set3 = set(coils)

common_elements = set1.intersection(set2, set3)

common_elements_list = list(common_elements)
common_elements_list

#-> no overlaps, yay
Out[ ]:
[]

RING¶

In [ ]:
#RING:

#Helices in RING
helices_RING = ['8', '9', '10',"11","12","13","14","15","16","17","18","19","20","21","22","28","29","30","31","41","42","43","44","45","46","47","48","49","50","51","52","70","71","72","73","78","79","80","81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97"]
helices_df_RING = df.loc[df['position'].astype(str).isin(helices_RING)]
helices_df_RING_bin0 = helices_df_RING.loc[helices_df_RING['DMS_score_bin'] == 0].copy()

#strands in RING (there are none)

#coils in RING
df['position'] = df['position'].astype(int)
coils_RING = list(range(1, 8)) + list(range(22, 35)) + list(range(37, 45))+ list(range(55, 67)) + list(range(69, 77)) + list(range(98, 102)) 
coils_df_RING = df.loc[df['position'].isin(coils_RING)]
coils_df_RING_bin0 = coils_df_RING.loc[coils_df_RING['DMS_score_bin'] == 0].copy()

#Barplot + ratios: 
df_counts = [len(helices_df_RING),len(helices_df_RING_bin0),0,0, len(coils_df_RING),len(coils_df_RING_bin0)]
df_names = ['α-helices',"h_bin0", 'β-strands',"s_bin0",'coils',"c_bin0"]
colors = ['crimson', 'crimson', 'indigo', 'indigo', 'dimgrey', 'dimgrey']
plt.bar(df_names, df_counts,color=colors)
plt.xlabel('Secondary structures')
plt.ylabel('Amount of SNVs')
plt.title('SNVs per secondary structure type: RING')
plt.show()
dataframes = [helices_df_RING, helices_df_RING_bin0, coils_df_RING, coils_df_RING_bin0]
df_names = ['helices_R', 'helices_bin0_R', 'coils_R', 'coils_bin0_R']
ratios = []
for i in range(1, len(dataframes), 2):
    ratio = (len(dataframes[i]) / len(dataframes[i-1])) * 100
    ratios.append(ratio)
for i, ratio in enumerate(ratios):
    print(f"ratio: {df_names[i*2+1]} to {df_names[i*2]}: {ratio:.2f}%")
ratio: helices_bin0_R to helices_R: 30.77%
ratio: coils_bin0_R to coils_R: 29.58%
In [ ]:
#Chi2, RING:
#H0: no correlation
len_hRING=len(helices_df_RING)
len_hRING0=len(helices_df_RING_bin0)
len_cRING=len(coils_df_RING)
len_cRING0=len(coils_df_RING_bin0)

observed_data_RING = np.array([[len_cRING, len_hRING],[len_cRING0, len_hRING0]])
chi2, p_value, dof, expected = chi2_contingency(observed_data_RING)

print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p_value}")
print(f"Degrees of freedom: {dof}")
print("Expected counts:")
print(expected)
Chi-square statistic: 0.022393889272595308
P-value: 0.8810440090537792
Degrees of freedom: 1
Expected counts:
[[282.61286255 326.38713745]
 [ 85.38713745  98.61286255]]

Since the P-value is >0.05 there is no strong indication to reject the H0 hypothesis. So in this domain the chances of a SNV leading to a bin_0 score is the same no matter what secondary structure.

In [ ]:
#Vizualizing secondary structures in RING

RING = df[:575]
new_RING = RING.copy()
new_RING['secondary structures'] = RING['position'].apply(lambda x: 'H' if str(x) in helices_RING else 'C')

positions = new_RING['position']
structures = new_RING['secondary structures']

colors = ['crimson' if structure == 'H' else 'dimgrey' for structure in structures]

# scatter plot:
plt.scatter(positions, structures, c=colors)
plt.xticks(positions, rotation='vertical')
plt.xlabel('Position', fontsize=15)
plt.ylabel('Secondary Structures', fontsize=15)
plt.title('Position vs. Secondary Structure - RING', fontsize=20)
fig = plt.gcf()
fig.set_size_inches(20, 3)
plt.gca().yaxis.set_tick_params(labelsize=10)
plt.show()
In [ ]:
#secondary structure + dms score_RING:
new_RING_agg = new_RING.pivot_table(index='secondary structures', columns='mutant', values='DMS_score')

# Get the order of mutants from the original DataFrame
mutant_order = new_RING['mutant'].unique()

# Reindex the columns of the pivot table to match the desired order
new_RING_agg = new_RING_agg.reindex(index=new_RING_agg.index[::-1],columns=mutant_order)

# Plot creation
plt.figure(figsize=(150, 30)) 

# heatmap
from matplotlib.colors import ListedColormap
colors = ['indianred', 'royalblue']
cmap = ListedColormap(colors)

heatmap=sns.heatmap(new_RING_agg, cmap=cmap,linewidths=0.5, fmt=".2f",cbar_kws={"label":"DMS_scores"})
plt.xlabel("Mutant", fontsize=100) 
plt.ylabel("Secondary Structures", fontsize=100) 
 
heatmap.set_yticklabels(heatmap.get_yticklabels(), fontsize=70) 
cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize=70)
cbar.ax.set_ylabel('DMS_score_bin', fontsize=100)

data_min = new_RING_agg.min().min()
data_max = new_RING_agg.max().max()
cbar.set_ticks([data_min, data_max])
cbar.set_ticklabels([0, 1])

plt.show()

BRCT¶

In [ ]:
#BRCT:

#Helices in BRCT
helices_BRCT = ["1659","1660","1661","1662","1663","1664","1665","1666","1667","1668","1669","1670","1671","1701","1702","1703","1704","1705","1706","1707","1708","1716","1717","1718","1719","1720","1721","1722","1723","1724","1725","1749","1750","1751","1752","1753","1754","1755","1777","1778","1779","1780","1781","1782","1783","1784","1785","1786","1820","1821","1822","1823","1824","1825","1826","1827","1835","1836","1837","1838","1839","1840","1841","1842","1843","1844"]
helices_df_BRCT = df.loc[df['position'].astype(str).isin(helices_BRCT)]
helices_df_BRCT_bin0 = helices_df_BRCT.loc[helices_df_BRCT['DMS_score_bin'] == 0].copy()

#strands in BRCT
strands_BRCT = ["1650","1651","1652","1653","1654","1674","1675","1676","1677","1684","1685","1686","1687","1688","1689","1712","1713","1714","1715","1765","1766","1767","1768","1769","1807","1808","1809","1810"]
strands_df_BRCT = df.loc[df['position'].astype(str).isin(strands_BRCT)]
strands_df_BRCT_bin0 = strands_df_BRCT.loc[strands_df_BRCT['DMS_score_bin'] == 0].copy()

#coils in BRCT
df['position'] = df['position'].astype(int)
coils_BRCT = list(range(1631, 1651))+ list(range(1656, 1660)) + list(range(1673, 1676)) + list(range(1679, 1686)) + list(range(1691, 1702)) + list(range(1710, 1715))+ list(range(1727, 1736))+ list(range(1738, 1753))+ list(range(1757, 1767))+ list(range(1771, 1778))+ list(range(1788, 1791))+ list(range(1794, 1796))+ list(range(1799, 1806))+ list(range(1812, 1821))+ list(range(1829, 1833))+ list(range(1846, 1851))+ list(range(1854, 1865)) 
coils_df_BRCT = df.loc[df['position'].isin(coils_BRCT)]
coils_df_BRCT_bin0 = coils_df_BRCT.loc[coils_df_BRCT['DMS_score_bin'] == 0].copy()

#Barplot+ratios:
df_counts = [len(helices_df_BRCT),len(helices_df_BRCT_bin0),len(strands_df_BRCT),len(strands_df_BRCT_bin0), len(coils_df_BRCT),len(coils_df_BRCT_bin0)]
df_names = ['α-helices',"h_bin0", 'β-strands',"s_bin0",'coils',"c_bin0"]
colors = ['crimson', 'crimson', 'indigo', 'indigo', 'dimgrey', 'dimgrey']
plt.bar(df_names, df_counts,color=colors)
plt.xlabel('Secondary structures')
plt.ylabel('Amount of SNVs')
plt.title('SNVs per secondary structure type: BRCT')
plt.show()
dataframes = [helices_df_BRCT, helices_df_BRCT_bin0, strands_df_BRCT,strands_df_BRCT_bin0,coils_df_BRCT, coils_df_BRCT_bin0]
df_names = ['helices_B', 'h_bin0_B',"strands_B","s_bin0_B", 'coils_B', 'coils_bin0_B']
ratios = []
for i in range(1, len(dataframes), 2):
    ratio = (len(dataframes[i]) / len(dataframes[i-1])) * 100
    ratios.append(ratio)
for i, ratio in enumerate(ratios):
    print(f"ratio: {df_names[i*2+1]} to {df_names[i*2]}: {ratio:.2f}%")
ratio: h_bin0_B to helices_B: 27.45%
ratio: s_bin0_B to strands_B: 40.54%
ratio: coils_bin0_B to coils_B: 21.57%
In [ ]:
#Chi2, BRCT:
#H0: no correlation
len_hBRCT=len(helices_df_BRCT)
len_hBRCT0=len(helices_df_BRCT_bin0)
len_cBRCT=len(coils_df_BRCT)
len_cBRCT0=len(coils_df_BRCT_bin0)
len_sBRCT=len(strands_df_BRCT)
len_sBRCT0=len(strands_df_BRCT_bin0)

observed_data_BRCT = np.array([[len_cBRCT, len_hRING, len_sBRCT],[len_cBRCT0, len_hBRCT0, len_sBRCT0]])
chi2, p_value, dof, expected = chi2_contingency(observed_data_BRCT)

print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p_value}")
print(f"Degrees of freedom: {dof}")
print("Expected counts:")
print(expected)
Chi-square statistic: 15.036107804784002
P-value: 0.0005431886360097251
Degrees of freedom: 2
Expected counts:
[[672.2040404 336.4969697 164.2989899]
 [178.7959596  89.5030303  43.7010101]]

Since the P-value is <0.05 there is a strong indication to reject the H0 hypothesis. So in this domain the chances of a SNV leading to a bin_0 score is different for each secondary structure.

In [ ]:
BRCT = df[575:]
new_BRCT = BRCT.copy()
new_BRCT['secondary structures'] = BRCT['position'].apply(lambda x: 'S' if str(x) in strands_BRCT else ('H' if str(x) in helices_BRCT else 'C'))

positions = new_BRCT['position']
structures = new_BRCT['secondary structures']

colors = ['crimson' if structure == 'H' else 'dimgrey' if structure == "C" else "indigo" for structure in structures]

# scatter plot:
plt.scatter(positions, structures, c=colors)

# Set the tick positions and labels on the X-axis
tick_positions = np.arange(min(positions) -1, max(positions) + 1, 10)
plt.xticks(tick_positions, rotation='vertical')

plt.xlabel('Position', fontsize=30)
plt.ylabel('Secondary Structures', fontsize=30)
plt.title('Position vs. Secondary Structure - BRCT', fontsize=35)

fig = plt.gcf()
fig.set_size_inches(40, 6)

plt.gca().yaxis.set_tick_params(labelsize=20)
plt.show()
In [ ]:
#secondary structures + dms score_BRCT:

# Remove duplicates or aggregate the data
new_BRCT_agg = new_BRCT.pivot_table(index='secondary structures', columns='mutant', values='DMS_score_bin')

# Get the order of mutants from the original DataFrame
mutant_order1 = new_BRCT['mutant'].unique()

# Reindex the columns of the pivot table to match the desired order
new_BRCT_agg = new_BRCT_agg.reindex(index=new_BRCT_agg.index[::-1],columns=mutant_order1)

structure_order = new_BRCT_agg.index.tolist()
structure_order[0], structure_order[1] = structure_order[1], structure_order[0]
new_BRCT_agg = new_BRCT_agg.reindex(index=structure_order)

# Plot creation
from matplotlib.colors import ListedColormap
colors = ['indianred', 'royalblue']
cmap = ListedColormap(colors)

plt.figure(figsize=(350, 50))  
heatmap=sns.heatmap(new_BRCT_agg, cmap=cmap,linewidths=0.5, fmt=".2f",cbar_kws={"label":"DMS_score_bin"})
plt.xlabel("Mutant", fontsize=150)
plt.ylabel("Secondary Structures",fontsize=150)

heatmap.set_yticklabels(heatmap.get_yticklabels(), fontsize=120) 
cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize=100)
cbar.ax.set_ylabel('DMS_score_bin', fontsize=150)
cbar.set_ticks([0.25, 0.75])
cbar.set_ticklabels([0, 1])

plt.show()

Conservation Analysis¶

In [ ]:
#importing the data from the msa

fasta_file = {}
key = None
value = ['']
with open(r"Resources/aln-fasta_final.txt",'r') as f:
    for line in f:
        if line.startswith('>'):
            fasta_file[key] = "".join(value)
            reading=True
            key = line.strip()
            value = []
            continue
        else:
            value.append(line.strip())
In [ ]:
#create sequence logo with full sequence


def create_sequence_logo(sequence_data, split_positions=None, columns=1, figsize=(16, 8), scaling_factor=0.04):

    amino_acids_to_plot = ['-',"A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V",'X']
    # Filter out empty sequences
    sequence_data = {k: v for k, v in sequence_data.items() if v}

    # Calculate the maximum length of the sequences
    max_length = max(len(seq) for seq in sequence_data.values())

    # Create a dictionary to store the frequency of each amino acid at each position
    amino_acid_frequencies = {aa: [0] * max_length for aa in amino_acids_to_plot}

    # Iterate over the sequences and update the amino acid frequencies
    for sequence in sequence_data.values():
        for i, aa in enumerate(sequence):
            amino_acid_frequencies[aa][i] += 1

    # Determine the splitting positions
    if split_positions is None:
        split_positions = [0, max_length]

    # Adjust the number of columns if needed
    num_plots = len(split_positions) - 1
    rows = num_plots // columns + (num_plots % columns > 0)

    # Create subplots with increased vertical gap
    fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False, gridspec_kw={'hspace': 0.5})
    axs = axs.ravel()

    # Plotting the amino acid frequencies by position for each subset
    term = False
    for i, ax in enumerate(axs):
        start = split_positions[i]

        if i == num_plots-1:  # Handle the last subset separately
            print('end')
            end = max_length
            term = True

        else:
            end = split_positions[i + 1]
        subset_data = {k: v[start:end] for k, v in sequence_data.items()}

        # Plotting the amino acid frequencies by position for each amino acid
        for aa in amino_acids_to_plot:
            frequencies = amino_acid_frequencies[aa][start:end]

            if any(frequencies):
                ax.plot(range(1, len(frequencies) + 1), [f * scaling_factor for f in frequencies], label=aa)

        ax.set_xlim(1, end - start + 1)
        ax.set_xticks(range(1, end - start + 1))
        ax.set_xticklabels(range(start + 1, end + 1))
        ax.set_xlabel('Position')
        ax.set_ylabel('Frequency')

        ax.set_title(f"Sequence Logo - Position {start+1}-{end}")
        if term:
            break
    # Create a single legend outside the loop
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='center', ncol=1)
#     fig.legend(handles, labels,  loc=(1.04, 0))

#     plt.tight_layout()
    plt.show()

split_positions = [0,  max(len(seq) for seq in fasta_file.values())]  # Manually specify splitting positions

create_sequence_logo(fasta_file, split_positions=split_positions, columns=2, figsize=(150, 10), scaling_factor=0.011)
end
In [ ]:
#create a consensus sequence

def create_consensus_sequence(sequence_data):
    # Filter out empty sequences
    sequence_data = {k: v for k, v in sequence_data.items() if v}

    # Calculate the maximum length of the sequences
    max_length = max(len(seq) for seq in sequence_data.values())

    # Create a dictionary to store the frequency of each amino acid at each position
    amino_acid_frequencies = {aa: [0] * max_length for aa in set(''.join(sequence_data.values()))}

    # Iterate over the sequences and update the amino acid frequencies
    for sequence in sequence_data.values():
        for i, aa in enumerate(sequence):
            amino_acid_frequencies[aa][i] += 1

    # Create the consensus sequence
    consensus_sequence = ''
    for i in range(max_length):
        max_frequency = 0
        max_amino_acid = ''
        for aa, frequencies in amino_acid_frequencies.items():
            frequency = frequencies[i]
            if frequency > max_frequency:
                max_frequency = frequency
                max_amino_acid = aa
        consensus_sequence += max_amino_acid

    return consensus_sequence



consensus_sequence = create_consensus_sequence(fasta_file)
consensus_sequence
Out[ ]:
'---------------------------------------------------------------------------------------------------------------------MDLSADRVEEVQNVLNAMQK---ILECPICLELIKEPVSTKCD-HIFCK---------FCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIIHAFELD---TGLQFAN-SYNFSKK-ENNSPEHLK-EEVSIIQSMGYRNRAKRLRQSEPENPTL--------QETSLSVQLSNLGIVRSLRTKQQIQPQNKSVYIELGSDSSEDTVNKASYCSVGDQELLQITPQGARAET---------------SLNSAKK--AACEFSEKDITNTEHHQSSNKDLNTTE-KHATERHPEKYQGISVSNLHVEPCGTNTHASSLQHENSSLLLTKDRMNVEKAEFCNKSKQPGLARSQQSRWAESKETCNDRQTPSTEKKVDLNADPLYGRKELNKQKPPCSESPRD-SQDVPWITLNSSIQKVNEWFSRSDELLTSD-DSHDGGSESNAEVAGALEV-------------------PNEVDGYSGSSEKIDLLASDPHEALICKSERVHSKPVESNIEDKIFGKTYRRKASLPNLSHV--TENLIIGAFVTEPQI-TQERPLTNKLKRKRRTTSGLHPEDFIKKADLAVVQKTPEKINQGTDQMEQNGQVMNITNNGHENKTKGDYVQKEKNANPTESLEKESAFRTKAEPISSSISNMELELNIHSSKAPKKNRLRRKSSTRHIHALELVVSRNPSPPNHTELQIDSCSSSEEMKK-KNSNQMPVRHSRKLQLMEDKEPATGAKKSNKPNEQISKRLASDTFPELKLTNVPGSFTNCSSSNKLQEFVNPSLQREEKEEKLETIQVSNSTKDPKDLMLSGG-RVLQTE---RSVESTSISLVPDTDYGTQDSISLLEADTLGKAKT-APNQCVSQCAAIENPKELIHGCSKDTRNDTEGFKDPLGHEVNH-SQETSIEMEESELDTQYLQNTFKVSKRQSFALFSNPGNPEKECATVSAHSRSLRKQSPKVTLECEQKEENQGKK-ESNIKHVQAVNTTAGFPVVCQKDKKPGDYAKCS--IKGVSRLCQSSQFRGNETELITANKHGISQNPYHIPPLSPIRSSVKTKCKKNLSEEKFEEHSMSPERAVGNENIIQSTVSTISQNNIRENAFKEASSSSINEVGSST-------NEVGSSINEVGSSGENIQAELGRNRGPKLNAVLRLGLMQPEVYKQSLPVSNCKHPEIKRQGENEGVVQAVNADFSPCLISDNLEQPMGSSHASQVCSETPDDLLDD--DEIKENTSFAESDIKERSAVFSKSVQRGEFRRSPSPFTH-THLAQGHQ-RGARKLESSEENLSSEDEELPCFQHLLFGKVTNTPS-QSTRHSTVATEGLSKKTEENLLSLKNSLNDC--SNQVILAKASQEHHLSEEARCSGSLFSSQCSALEDLTANT--------NTQDPFLMFDPPSKQMRHQSESQEVL-SDKELVSDD----EERETGLEEDN-QEEQSVDSN-LGEAASGYESETSLSEDCSGLSSQSDILTTQ-------QRDTMQDNLIKLQQEMAELEAVLEQHGSQPS-NSSPSLIADSCA--PEDLLNPEQNTSEK---------------------AVLTSEKSSEYPISQNPESLSADKFQVSLDSSTSK-NKEPGMER---------------------SSPSKSQLL-DDRWYVHSHSRSLQNRNCPSQEELIKVVDVEEQQLEKS-------------EPQDLMEQSYLPRQDLEGTP-YLESGISLFSDDPESDPSEDRAPEPAHVGSMPSSTSALKLPQFQVAESAKSPAAAHTTNTAGYNAREESVSREKPELTSSTER-VNKRISMVASGLTPKEFMLVQKFARKHHITLTNLITEETTHVIMKTDAEFVCERTLKYFLGIAGGKWVVSYFWVTQSIKERKMLDEHDFEVRGDVVNGRNHQGPKRARES-----QDRKIFRGLEICCYGPFTNMPT-------------------------------------------------------------------------DQLEWMVQLCGASVVKEPSSFTLGTG---THPVVVVQPDAWTEDSGFHAIGQMCEAPVVTREWVLDSVALYQCQELDT---YLIPQIPHSHY-----------------------------------'
In [ ]:
#define the sequence of human BRCA1

brca_human="---------------------------------------------------------------------------------------------------------------------MDLSALRVEEVQNVINAMQK---ILECPICLELIKEPVSTKCD-HIFCK---------FCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLD---TGLEYAN-SYNFAKK-ENNSPEHLK-DEVSIIQSMGYRNRAKRLLQSEPENPSL--------QETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYIELGSDSSEDTVNKATYCSVGDQELLQITPQGTRDEI---------------SLDSAKK--AACEFSETDVTNTEHHQPSNNDLNTTE-KRAAERHPEKYQGSSVSNLHVEPCGTNTHASSLQHENSSLLLTKDRMNVEKAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCERKEWNKQKLPCSENPRD-TEDVPWITLNSSIQKVNEWFSRSDELLGSD-DSHDGESESNAKVADVLDV-------------------LNEVDEYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHV--TENLIIGAFVTEPQI-IQERPLTNKLKRKRRPTSGLHPEDFIKKADLAV-QKTPEMINQGTNQTEQNGQVMNITNSGHENKTKGDSIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNIHNSKAPKKNRLRRKSSTRHIHALELVVSRNLSPPNCTELQIDSCSSSEEIKK-KKYNQMPVRHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPELKLTNAPGSFTKCSNTSELKEFVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGE-RVLQTE---RSVESSSISLVPGTDYGTQESISLLEVSTLGKAKT-EPNKCVSQCAAFENPKGLIHGCSKDNRNDTEGFKYPLGHEVNH-SRETSIEMEESELDAQYLQNTFKVSKRQSFAPFSNPGNAEEECATFSAHSGSLKKQSPKVTFECEQKEENQGKN-ESNIKPVQTVNITAGFPVVGQKDKP-VDNAKCS--IKGGSRFCLSSQFRGNETGLITPNKHGLLQNPYRIPPLFPIKSFVKTKCKKNLLEENFEEHSMSPEREMGNEN-IPSTVSTISRNNIRENVFKEASSSNINEVGSST-------NEVGSSINEIGSSDENIQAELGRNRGPKLNAMLRLGVLQPEVYKQSLPGSNCKHPEIKKQEY-EEVVQTVNTDFSPYLISDNLEQPMGSSHASQVCSETPDDLLDD--GEIKEDTSFAENDIKESSAVFSKSVQKGELSRSPSPFTH-THLAQGYR-RGAKKLESSEENLSSEDEELPCFQHLLFGKVNNIPS-QSTRHSTVATECLSKNTEENLLSLKNSLNDC--SNQVILAKASQEHHLSEETKCSASLFSSQCSELEDLTANT--------NTQDPFLIG--SSKQMRHQSESQGVGLSDKELVSDD----EERGTGLEENN-QEEQSMDSN-LGEAASGCESETSVSEDCSGLSSQSDILTTQ-------QRDTMQHNLIKLQQEMAELEAVLEQHGSQPS-NSYPSIISDSSA--LEDLRNPEQSTSEK---------------------AVLTSQKSSEYPISQNPEGLSADKFEVSADSSTSK-NKEPGVER---------------------SSPSKCPSL-DDRWYMHSCSGSLQNRNYPSQEELIKVVDVEEQQLEES-------------GPHDLTETSYLPRQDLEGTP-YLESGISLFSDDPESDPSEDRAPESARVGNIPSSTSALKVPQLKVAESAQSPAAAHTTDTAGYNAMEESVSREKPELTASTER-VNKRMSMVVSGLTPEEFMLVYKFARKHHITLTNLITEETTHVVMKTDAEFVCERTLKYFLGIAGGKWVVSYFWVTQSIKERKMLNEHDFEVRGDVVNGRNHQGPKRARES-----QDRKIFRGLEICCYGPFTNMPT-------------------------------------------------------------------------DQLEWMVQLCGASVVKELSSFTLGTG---VHPIVVVQPDAWTEDNGFHAIGQMCEAPVVTREWVLDSVALYQCQELDT---YLIPQIPHSHY-----------------------------------"
In [ ]:
#check percentage of positions in the MSA that score above a certain conservation threshold

def count_positions_above_threshold(sequence_data, threshold):
    amino_acids_to_plot = ['-',"A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V",'X']

    # Filter out empty sequences
    sequence_data = {k: v for k, v in sequence_data.items() if v}

    # Calculate the maximum length of the sequences
    max_length = max(len(seq) for seq in sequence_data.values())

    # Create a dictionary to store the frequency of each amino acid at each position
    amino_acid_frequencies = {aa: [0] * max_length for aa in amino_acids_to_plot}

    # Iterate over the sequences and update the amino acid frequencies
    for sequence in sequence_data.values():
        for i, aa in enumerate(sequence):
            amino_acid_frequencies[aa][i] += 1

    # Count the number of positions above the threshold
    count = 0
    for i in range(max_length):
        frequencies = [amino_acid_frequencies[aa][i] for aa in amino_acids_to_plot]
        max_frequency = max(frequencies)
        if max_frequency / sum(frequencies) > threshold:
            count += 1

    return count

# % of positions with more than 70% accordance
print(count_positions_above_threshold(fasta_file, 0.7)/len(consensus_sequence))
#% of positions with more than 80% accordance
print(count_positions_above_threshold(fasta_file, 0.8)/len(consensus_sequence))
#% of positions with more than 90% accordance
print(count_positions_above_threshold(fasta_file, 0.9)/len(consensus_sequence))
#% of positions with more than 95% accordance
print(count_positions_above_threshold(fasta_file, 0.95)/len(consensus_sequence))
0.8110477860587462
0.6861025865848313
0.4270056992547128
0.25208241999123193
In [ ]:
#create sequence logo with split sequence

def create_sequence_logo(sequence_data, split_positions=None, columns=1, figsize=(16, 8), scaling_factor=0.04):

    amino_acids_to_plot = ['-',"A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V",'X']
    # Filter out empty sequences
    sequence_data = {k: v for k, v in sequence_data.items() if v}

    # Calculate the maximum length of the sequences
    max_length = max(len(seq) for seq in sequence_data.values())

    # Create a dictionary to store the frequency of each amino acid at each position
    amino_acid_frequencies = {aa: [0] * max_length for aa in amino_acids_to_plot}

    # Iterate over the sequences and update the amino acid frequencies
    for sequence in sequence_data.values():
        for i, aa in enumerate(sequence):
            amino_acid_frequencies[aa][i] += 1

    # Determine the splitting positions
    if split_positions is None:
        split_positions = [0, max_length]

    # Adjust the number of columns if needed
    num_plots = len(split_positions) - 1
    rows = num_plots // columns + (num_plots % columns > 0)

    # Create subplots with increased vertical gap
    fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False, gridspec_kw={'hspace': 0.5})
    axs = axs.ravel()

    # Plotting the amino acid frequencies by position for each subset
    term = False
    for i, ax in enumerate(axs):
        start = split_positions[i]

        if i == num_plots-1:  # Handle the last subset separately
            print('end')
            end = max_length
            term = True

        else:
            end = split_positions[i + 1]
        subset_data = {k: v[start:end] for k, v in sequence_data.items()}

        # Plotting the amino acid frequencies by position for each amino acid
        for aa in amino_acids_to_plot:
            frequencies = amino_acid_frequencies[aa][start:end]

            if any(frequencies):
                ax.plot(range(1, len(frequencies) + 1), [f * scaling_factor for f in frequencies], label=aa)

        ax.set_xlim(1, end - start + 1)
        ax.set_xticks(range(1, end - start + 1))
        ax.set_xticklabels(range(start + 1, end + 1))
        ax.set_xlabel('Position')
        ax.set_ylabel('Frequency')

        ax.set_title(f"Sequence Logo - Position {start+1}-{end}")
        if term:
            break
    # Create a single legend outside the loop
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='center', ncol=1)
#     fig.legend(handles, labels,  loc=(1.04, 0))

#     plt.tight_layout()
    plt.show()

split_positions = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, max(len(seq) for seq in fasta_file.values())]  # Manually specify splitting positions

create_sequence_logo(fasta_file, split_positions=split_positions, columns=2, figsize=(50, 30), scaling_factor=0.011)
end
In [ ]:
#create sequence logo with sequence split into domains

def create_sequence_logo(sequence_data, split_positions=None, columns=1, figsize=(16, 8), scaling_factor=0.04):

    amino_acids_to_plot = ['-',"A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V",'X']
    # Filter out empty sequences
    sequence_data = {k: v for k, v in sequence_data.items() if v}

    # Calculate the maximum length of the sequences
    max_length = max(len(seq) for seq in sequence_data.values())

    # Create a dictionary to store the frequency of each amino acid at each position
    amino_acid_frequencies = {aa: [0] * max_length for aa in amino_acids_to_plot}

    # Iterate over the sequences and update the amino acid frequencies
    for sequence in sequence_data.values():
        for i, aa in enumerate(sequence):
            amino_acid_frequencies[aa][i] += 1

    # Determine the splitting positions
    if split_positions is None:
        split_positions = [0, max_length]

    # Adjust the number of columns if needed
    num_plots = len(split_positions) - 1
    rows = num_plots // columns + (num_plots % columns > 0)

    # Create subplots with increased vertical gap
    fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False, gridspec_kw={'hspace': 0.5})
    axs = axs.ravel()

    # Plotting the amino acid frequencies by position for each subset
    term = False
    for i, ax in enumerate(axs):
        start = split_positions[i]

        if i == num_plots-1:  # Handle the last subset separately
            print('end')
            end = max_length
            term = True

        else:
            end = split_positions[i + 1]
        subset_data = {k: v[start:end] for k, v in sequence_data.items()}

        # Plotting the amino acid frequencies by position for each amino acid
        for aa in amino_acids_to_plot:
            frequencies = amino_acid_frequencies[aa][start:end]

            if any(frequencies):
                ax.plot(range(1, len(frequencies) + 1), [f * scaling_factor for f in frequencies], label=aa)

        ax.set_xlim(1, end - start + 1)
        ax.set_xticks(range(1, end - start + 1))
        ax.set_xticklabels(range(start + 1, end + 1))
        ax.set_xlabel('Position')
        ax.set_ylabel('Frequency')

        ax.set_title(f"Sequence Logo - Position {start+1}-{end}")
        if term:
            break
    # Create a single legend outside the loop
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='center', ncol=1)
#     fig.legend(handles, labels,  loc=(1.04, 0))

#     plt.tight_layout()
    plt.show()

split_positions = [0,352,1929, max(len(seq) for seq in fasta_file.values())]  # Manually specify splitting positions

create_sequence_logo(fasta_file, split_positions=split_positions, columns=2, figsize=(60, 8), scaling_factor=0.011)
end
In [ ]:
#create comparison between human BRCA1 and the consensus sequence

def compare_sequences(consensus_sequence, other_sequence, split_positions=None, columns=1, figsize=(16, 8)):
    # Check if the sequences have the same length
    if len(consensus_sequence) != len(other_sequence):
        print("Sequences have different lengths.")
        return

    # Determine the splitting positions
    if split_positions is None:
        split_positions = [0, len(consensus_sequence)]

    # Adjust the number of columns if needed
    num_plots = len(split_positions) - 1
    rows = num_plots // columns + (num_plots % columns > 0)

    # Create subplots
    fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False)
    axs = axs.ravel()

    # Iterate over the subsets
    for i, ax in enumerate(axs):
        start = split_positions[i]
        end = split_positions[i + 1]

        # Create a plot to visualize the sequences within the subset
        for j in range(start, end):
            consensus_aa = consensus_sequence[j]
            other_aa = other_sequence[j]
            if consensus_aa == other_aa:
                ax.text(j - start, 0.5, consensus_aa, color='green', fontsize=60, ha='center')
            else:
                ax.text(j - start, 0.7, consensus_aa, color='green', fontsize=60, ha='center')
                ax.text(j - start, 0.3, other_aa, color='red', fontsize=60, ha='center')

        # Set plot properties for each subset
        ax.set_xlim(-0.5, end - start - 0.5)
        ax.set_ylim(0, 1)
        ax.set_xticks(range(end - start))
        ax.set_yticks([])
        ax.set_xlabel('Position')
        ax.set_title(f'Subset {i+1}')

    plt.tight_layout()
    plt.show()

# Example usage:

split_positions = [0, len(consensus_sequence)]  # Manually specify splitting positions

compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(200, 30))
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[25], line 51
     47 # Example usage:
     49 split_positions = [0, len(consensus_sequence)]  # Manually specify splitting positions
---> 51 compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(200, 30))

Cell In[25], line 24, in compare_sequences(consensus_sequence, other_sequence, split_positions, columns, figsize)
     22 for i, ax in enumerate(axs):
     23     start = split_positions[i]
---> 24     end = split_positions[i + 1]
     26     # Create a plot to visualize the sequences within the subset
     27     for j in range(start, end):

IndexError: list index out of range
In [ ]:
#split up again

def compare_sequences(consensus_sequence, other_sequence, split_positions=None, columns=1, figsize=(16, 8)):
    # Check if the sequences have the same length
    if len(consensus_sequence) != len(other_sequence):
        print("Sequences have different lengths.")
        return

    # Determine the splitting positions
    if split_positions is None:
        split_positions = [0, len(consensus_sequence)]

    # Adjust the number of columns if needed
    num_plots = len(split_positions) - 1
    rows = num_plots // columns + (num_plots % columns > 0)

    # Create subplots
    fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False)
    axs = axs.ravel()

    # Iterate over the subsets
    for i, ax in enumerate(axs):
        start = split_positions[i]
        end = split_positions[i + 1]

        # Create a plot to visualize the sequences within the subset
        for j in range(start, end):
            consensus_aa = consensus_sequence[j]
            other_aa = other_sequence[j]
            if consensus_aa == other_aa:
                ax.text(j - start, 0.5, consensus_aa, color='green', fontsize=20, ha='center')
            else:
                ax.text(j - start, 0.7, consensus_aa, color='green', fontsize=20, ha='center')
                ax.text(j - start, 0.3, other_aa, color='red', fontsize=20, ha='center')

        # Set plot properties for each subset
        ax.set_xlim(-0.5, end - start - 0.5)
        ax.set_ylim(0, 1)
        ax.set_xticks(range(end - start))
        ax.set_yticks([])
        ax.set_xlabel('Position')
        ax.set_title(f'Subset {i+1}')

    plt.tight_layout()
    plt.show()

# Example usage:

split_positions = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, len(consensus_sequence)]  # Manually specify splitting positions

compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(50, 30))
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[27], line 51
     47 # Example usage:
     49 split_positions = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, len(consensus_sequence)]  # Manually specify splitting positions
---> 51 compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(50, 30))

Cell In[27], line 24, in compare_sequences(consensus_sequence, other_sequence, split_positions, columns, figsize)
     22 for i, ax in enumerate(axs):
     23     start = split_positions[i]
---> 24     end = split_positions[i + 1]
     26     # Create a plot to visualize the sequences within the subset
     27     for j in range(start, end):

IndexError: list index out of range
In [ ]:
#split up into domains again

def compare_sequences(consensus_sequence, other_sequence, split_positions=None, columns=1, figsize=(16, 8)):
    # Check if the sequences have the same length
    if len(consensus_sequence) != len(other_sequence):
        print("Sequences have different lengths.")
        return

    # Determine the splitting positions
    if split_positions is None:
        split_positions = [0, len(consensus_sequence)]

    # Adjust the number of columns if needed
    num_plots = len(split_positions) - 1
    rows = num_plots // columns + (num_plots % columns > 0)

    # Create subplots
    fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False)
    axs = axs.ravel()

    # Iterate over the subsets
    for i, ax in enumerate(axs):
        start = split_positions[i]
        end = split_positions[i + 1]

        # Create a plot to visualize the sequences within the subset
        for j in range(start, end):
            consensus_aa = consensus_sequence[j]
            other_aa = other_sequence[j]
            if consensus_aa == other_aa:
                ax.text(j - start, 0.5, consensus_aa, color='green', fontsize=60, ha='center')
            else:
                ax.text(j - start, 0.7, consensus_aa, color='green', fontsize=60, ha='center')
                ax.text(j - start, 0.3, other_aa, color='red', fontsize=60, ha='center')

        # Set plot properties for each subset
        ax.set_xlim(-0.5, end - start - 0.5)
        ax.set_ylim(0, 1)
        ax.set_xticks(range(end - start))
        ax.set_yticks([])
        ax.set_xlabel('Position')
        ax.set_title(f'Subset {i+1}')

    plt.tight_layout()
    plt.show()

# Example usage:

split_positions = [0,352,1929, max(len(seq) for seq in fasta_file.values())]  # Manually specify splitting positions
compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(200, 20))
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[29], line 50
     47 # Example usage:
     49 split_positions = [0,352,1929, max(len(seq) for seq in fasta_file.values())]  # Manually specify splitting positions
---> 50 compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(200, 20))

Cell In[29], line 24, in compare_sequences(consensus_sequence, other_sequence, split_positions, columns, figsize)
     22 for i, ax in enumerate(axs):
     23     start = split_positions[i]
---> 24     end = split_positions[i + 1]
     26     # Create a plot to visualize the sequences within the subset
     27     for j in range(start, end):

IndexError: list index out of range

Post-translational Modifications¶

In [ ]:
data_raw = BRCA1_raw.copy()
In [ ]:
position = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, ..., 1855, 1855, 1855, 1855, 1855]  
In [ ]:
position_df = pd.DataFrame({'position': position})
In [ ]:
data_with_position = pd.concat([data_raw, position_df], axis=1)
In [ ]:
data_raw['position'] = data_raw['mutant'].str.extract('(\d+)').astype(int)
In [ ]:
print(data_raw.columns)
Index(['mutant', 'mutated_sequence', 'DMS_score', 'DMS_score_bin', 'position'], dtype='object')
In [ ]:
test1=data_raw.mutant.str.extract(r'(\d)([ARNDCQEGHILKTMfPOUSTWYV])')
test1
Out[ ]:
0 1
0 1 I
1 1 V
2 1 T
3 1 R
4 1 L
... ... ...
1832 5 R
1833 5 M
1834 5 L
1835 5 K
1836 5 T

1837 rows × 2 columns

In [ ]:
data_raw["new_aa"]=test1[1]
data_raw.head
Out[ ]:
<bound method NDFrame.head of       mutant                                   mutated_sequence  DMS_score  \
0        M1I  IDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -2.502128   
1        M1V  VDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -2.025645   
2        M1T  TDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -1.656569   
3        M1R  RDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -2.481580   
4        M1L  LDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -2.516529   
...      ...                                                ...        ...   
1832  I1855R  MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -0.464328   
1833  I1855M  MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...   0.021941   
1834  I1855L  MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...   0.233297   
1835  I1855K  MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -0.863844   
1836  I1855T  MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -0.291519   

      DMS_score_bin  position new_aa  
0                 0         1      I  
1                 0         1      V  
2                 0         1      T  
3                 0         1      R  
4                 0         1      L  
...             ...       ...    ...  
1832              1      1855      R  
1833              1      1855      M  
1834              1      1855      L  
1835              1      1855      K  
1836              1      1855      T  

[1837 rows x 6 columns]>
In [ ]:
copy_raw = data_raw[data_raw['DMS_score_bin'] == 0].copy()
In [ ]:
# Adding a new column 'old_aa' 
copy_raw['old_aa'] = copy_raw['mutant'].str[0]

print(data_raw.head())
  mutant                                   mutated_sequence  DMS_score  \
0    M1I  IDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -2.502128   
1    M1V  VDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -2.025645   
2    M1T  TDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -1.656569   
3    M1R  RDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -2.481580   
4    M1L  LDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...  -2.516529   

   DMS_score_bin  position new_aa  
0              0         1      I  
1              0         1      V  
2              0         1      T  
3              0         1      R  
4              0         1      L  
In [ ]:
# creating a pivot table 
pivot_table = copy_raw.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')

fig, ax = plt.subplots(figsize=(50,10))

# heatmap
sns.heatmap(pivot_table, cmap='coolwarm_r')

# adding annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        new_aa = pivot_table.index[i]
        position = pivot_table.columns[j]
        old_aa = copy_raw[(copy_raw['new_aa'] == new_aa) & (copy_raw['position'] == position)]['old_aa'].values
        if len(old_aa) > 0:
            ax.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)

plt.xlabel('position')
plt.ylabel('new_aa')
plt.title('DMS Scores Heatmap')

plt.show()
In [ ]:
# List of polar amino acids and filtering them out of the dataset
polar_amino_acids = ['S', 'T', 'Y', 'C', 'N', 'Q']
polar_data = copy_raw[copy_raw['new_aa'].isin(polar_amino_acids)][['new_aa', 'position', 'DMS_score', 'old_aa']]

# Creating a pivot table 
pivot_table = polar_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')


plt.figure(figsize=(20, 6))

sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)

# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        new_aa = pivot_table.index[i]
        position = pivot_table.columns[j]
        old_aa = polar_data[(polar_data['new_aa'] == new_aa) & (polar_data['position'] == position)]['old_aa'].values
        if len(old_aa) > 0:
            plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)


plt.xlabel('Position')
plt.ylabel('new_aa polar')
plt.title('DMS Scores of new Polar Amino Acids')

plt.show()
In [ ]:
# List of non-polar amino acids and filtering them out of the dataset
non_polar_amino_acids = ['A', 'V', 'L', 'I', 'P', 'M', 'F', 'W', 'G']
non_polar_data = copy_raw[copy_raw['new_aa'].isin(non_polar_amino_acids)][['new_aa', 'position', 'DMS_score']]

# Merge with the original dataframe to include the "old_aa" column
non_polar_data = non_polar_data.merge(copy_raw[['new_aa', 'position', 'old_aa']], on=['new_aa', 'position'])

# Creating a pivot table 
pivot_table = non_polar_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')


plt.figure(figsize=(30, 6))

sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)

# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        new_aa = pivot_table.index[i]
        position = pivot_table.columns[j]
        old_aa = non_polar_data[(non_polar_data['new_aa'] == new_aa) & (non_polar_data['position'] == position)]['old_aa'].values
        if len(old_aa) > 0:
            plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)

plt.xlabel('Position')
plt.ylabel('new_aa non-polar')
plt.title('DMS Scores of new Non-Polar Amino Acids')

plt.show()
In [ ]:
# List of positively charged amino acids and filtering them out of the dataset
positively_charged_amino_acids = ['R', 'H', 'K']
positively_charged_data = copy_raw[copy_raw['new_aa'].isin(positively_charged_amino_acids)][['new_aa', 'position', 'DMS_score', 'old_aa']]

# Creating a pivot table 
pivot_table = positively_charged_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')


plt.figure(figsize=(30, 6))

sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)

# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        new_aa = pivot_table.index[i]
        position = pivot_table.columns[j]
        old_aa = positively_charged_data[(positively_charged_data['new_aa'] == new_aa) & (positively_charged_data['position'] == position)]['old_aa'].values
        if len(old_aa) > 0:
            plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)


plt.xlabel('Position')
plt.ylabel('new_aa positive charge')
plt.title('DMS Scores of new Positively Charged Amino Acids')

plt.show()
In [ ]:
# List of negatively charged amino acids and filtering them out of the dataset
negatively_charged_amino_acids = ['D', 'E']
negatively_charged_data = copy_raw[copy_raw['new_aa'].isin(negatively_charged_amino_acids)][['new_aa', 'position', 'DMS_score', 'old_aa']]

# Creating a pivot table
pivot_table = negatively_charged_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')


plt.figure(figsize=(20, 6))

sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)

# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        new_aa = pivot_table.index[i]
        position = pivot_table.columns[j]
        old_aa = negatively_charged_data[(negatively_charged_data['new_aa'] == new_aa) & (negatively_charged_data['position'] == position)]['old_aa'].values
        if len(old_aa) > 0:
            plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)


plt.xlabel('Position')
plt.ylabel('new_aa negative charge')
plt.title('DMS Scores of new Negatively Charged Amino Acids')

plt.show()
In [ ]:
# List of essential amino acids and filtering them out of the dataset
essential_amino_acids = ['H','I','L','K','M','F','T','W','V']
essential_data = copy_raw[copy_raw['old_aa'].isin(essential_amino_acids)][['old_aa', 'position', 'DMS_score', 'new_aa']]

# Creating a pivot table 
pivot_table = essential_data.pivot_table(index='old_aa', columns='position', values='DMS_score', aggfunc='mean')


plt.figure(figsize=(30, 6))

sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)

# Add annotations for the "new_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        old_aa = pivot_table.index[i]
        position = pivot_table.columns[j]
        new_aa = essential_data[(essential_data['old_aa'] == old_aa) & (essential_data['position'] == position)]['new_aa'].values
        if len(new_aa) > 0:
            plt.text(j + 0.5, i + 0.5, new_aa[0], ha='center', va='center', fontsize=8)


plt.xlabel('Position')
plt.ylabel('old_aa essential')
plt.title('DMS Scores of Essential Amino Acids')

plt.show()
In [ ]:
# List of essential amino acids and filtering them out of the dataset
essential_amino_acids = ['H', 'I', 'L', 'K', 'M', 'F', 'T', 'W', 'V']
essential_data = copy_raw[copy_raw['new_aa'].isin(essential_amino_acids)][['new_aa', 'position', 'DMS_score', 'old_aa']]

# Creating a pivot table 
pivot_table = essential_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')


plt.figure(figsize=(30, 6))

sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)

# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        new_aa = pivot_table.index[i]
        position = pivot_table.columns[j]
        old_aa = essential_data[(essential_data['new_aa'] == new_aa) & (essential_data['position'] == position)]['old_aa'].values
        if len(old_aa) > 0:
            plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)


plt.xlabel('Position')
plt.ylabel('new_aa essential')
plt.title('DMS Scores of Essential Amino Acids')

plt.show()
In [ ]:
# List of non-essential amino acids and filtering them out of the dataset
non_essential_amino_acids = ['A', 'D', 'N', 'E', 'S', 'U', 'O']

non_essential_data = copy_raw[~copy_raw['old_aa'].isin(non_essential_amino_acids)][['old_aa', 'position', 'DMS_score', 'new_aa']]

# Creating a pivot table 
pivot_table = non_essential_data.pivot_table(index='old_aa', columns='position', values='DMS_score', aggfunc='mean')


plt.figure(figsize=(20, 6))

sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)

# Add annotations for the "new_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        old_aa = pivot_table.index[i]
        position = pivot_table.columns[j]
        new_aa = non_essential_data[(non_essential_data['old_aa'] == old_aa) & (non_essential_data['position'] == position)]['new_aa'].values
        if len(new_aa) > 0:
            plt.text(j + 0.5, i + 0.5, new_aa[0], ha='center', va='center', fontsize=8)

plt.xlabel('Position')
plt.ylabel('old_aa non-essential')
plt.title('DMS Scores of Non-Essential Amino Acids')

plt.show()
In [ ]:
# List of non-essential amino acids and filtering them out of the dataset
non_essential_amino_acids = ['A', 'D', 'N', 'E', 'S', 'U', 'O']
non_essential_data = copy_raw[~copy_raw['new_aa'].isin(non_essential_amino_acids)][['new_aa', 'position', 'DMS_score', 'old_aa']]

# Creating a pivot table 
pivot_table = non_essential_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')


plt.figure(figsize=(20, 6))

sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)

# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        new_aa = pivot_table.index[i]
        position = pivot_table.columns[j]
        old_aa = non_essential_data[(non_essential_data['new_aa'] == new_aa) & (non_essential_data['position'] == position)]['old_aa'].values
        if len(old_aa) > 0:
            plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)

plt.xlabel('Position')
plt.ylabel('new_aa non-essential')
plt.title('DMS Scores of Non-Essential Amino Acids')

plt.show()
In [ ]:
copy_phosphorylation = copy_raw.loc[copy_raw['old_aa'].isin(['S', 'T', 'Y'])].copy()
In [ ]:
# Filtering the dataset for rows excluding 'S', 'T', or 'Y' in the "new_aa" column
filtered_data = copy_phosphorylation.loc[~copy_phosphorylation['new_aa'].isin(['S', 'T', 'Y'])]

# Reshaping the filtered data for heatmap visualization
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")

# Creating a separate dataframe to store the "old_aa" values
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")

# the heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap="coolwarm_r", annot=False, cbar=True)
plt.title("DMS_score phosphorylation")
plt.xlabel("Position")
plt.ylabel("new_aa")

# Add annotations for the "old_aa" values
for i in range(len(heatmap_data.index)):
    for j in range(len(heatmap_data.columns)):
        plt.text(j + 0.5, i + 0.5, old_aa_data.iloc[i, j], ha='center', va='center', fontsize=8)

plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/908054992.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/908054992.py:8: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
In [ ]:
copy_new_phosphorylation = copy_raw[copy_raw['new_aa'].isin(['S', 'T', 'Y'])].copy()
In [ ]:
# Filtering the dataset for rows that have any value except 'S', 'T', or 'Y' in the "old_aa" column
filtered_data = copy_new_phosphorylation[~copy_new_phosphorylation['old_aa'].isin(['S', 'T', 'Y'])]

# Reshaping the filtered data for heatmap visualization
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")

# Creating a separate dataframe to store the "old_aa" values
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")

# The heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap="coolwarm_r", annot=old_aa_data, fmt="", cbar=True)
plt.title("DMS_score phosphorylation")
plt.xlabel("Position")
plt.ylabel("new_aa")

plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/2442374556.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/2442374556.py:8: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
In [ ]:
copy_new_lys = copy_raw[copy_raw['new_aa'].isin(['K'])].copy()
In [ ]:
# Filtering the dataset for rows that have 'K' in the "new_aa" column
filtered_data = copy_new_lys[copy_new_lys['new_aa'] == 'K']

# Reshaping the filtered data for heatmap visualization
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")

# Creating a separate dataframe to store the "old_aa" values
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")

# The heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap="coolwarm_r", annot=old_aa_data, fmt="", cbar=True)
plt.title("DMS_score acetylation")
plt.xlabel("Position")
plt.ylabel("new_aa")

plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/1979893177.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/1979893177.py:8: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
In [ ]:
# Creating a copy of the dataset with rows that have 'K' in the "old_aa" column
copy_old_lys = copy_raw[copy_raw['old_aa'].str.contains('K')].copy()
In [ ]:
# Filtering the dataset for rows that have 'K' in the "old_aa" column
filtered_data = copy_old_lys[copy_old_lys['old_aa'] == 'K']

# Reshaping the filtered data for heatmap visualization
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")

# Creating a separate dataframe to store the "old_aa" values
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")

# The heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap="coolwarm_r", annot=old_aa_data, fmt="", cbar=True)
plt.title("DMS_score acetylationg")
plt.xlabel("Position")
plt.ylabel("new_aa")

plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/3347817003.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/3347817003.py:8: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
In [ ]:
copy_old_glyc = copy_raw[copy_raw['old_aa'].isin(['S', 'T', 'N'])].copy()
In [ ]:
# Creating a pivot table for DMS scores
pivot_table = copy_old_glyc.pivot('new_aa', 'position', 'DMS_score')

# The heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(pivot_table, cmap='coolwarm_r', linewidths=0.5, linecolor='lightgray')

# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        mask = (copy_old_glyc['new_aa'] == pivot_table.index[i]) & (copy_old_glyc['position'] == pivot_table.columns[j])
        if mask.any():
            plt.text(j + 0.5, i + 0.5, copy_old_glyc.loc[mask, 'old_aa'].values[0],
                     ha='center', va='center', fontsize=8)

plt.title('DMS glycolysation')
plt.xlabel('Position')
plt.ylabel('new_aa')

plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/1995355757.py:2: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  pivot_table = copy_old_glyc.pivot('new_aa', 'position', 'DMS_score')
In [ ]:
filtered_data = copy_old_glyc[~copy_old_glyc['old_aa'].isin(['S', 'T', 'N'])]
In [ ]:
copy_new_glyc = copy_raw[copy_raw['new_aa'].isin(['S', 'T', 'N'])].copy()
In [ ]:
# Creating a pivot table for DMS scores
pivot_table = copy_new_glyc.pivot('new_aa', 'position', 'DMS_score')

# The heatmap
plt.figure(figsize=(23, 10))
sns.heatmap(pivot_table, cmap='coolwarm_r', linewidths=0.5, linecolor='lightgray')

# Adding annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
    for j in range(len(pivot_table.columns)):
        mask = (copy_new_glyc['new_aa'] == pivot_table.index[i]) & (copy_new_glyc['position'] == pivot_table.columns[j])
        if mask.any():
            plt.text(j + 0.5, i + 0.5, copy_new_glyc.loc[mask, 'old_aa'].values[0],
                     ha='center', va='center', fontsize=8)

plt.title('DMS glycolysation')
plt.xlabel('Position')
plt.ylabel('new_aa')

plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/437836168.py:2: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
  pivot_table = copy_new_glyc.pivot('new_aa', 'position', 'DMS_score')
In [ ]:
data_raw
Out[ ]:
mutant mutated_sequence DMS_score DMS_score_bin position new_aa
0 M1I IDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.502128 0 1 I
1 M1V VDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.025645 0 1 V
2 M1T TDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -1.656569 0 1 T
3 M1R RDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.481580 0 1 R
4 M1L LDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.516529 0 1 L
... ... ... ... ... ... ...
1832 I1855R MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.464328 1 1855 R
1833 I1855M MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... 0.021941 1 1855 M
1834 I1855L MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... 0.233297 1 1855 L
1835 I1855K MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.863844 1 1855 K
1836 I1855T MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.291519 1 1855 T

1837 rows × 6 columns

In [ ]:
data_fit = df[df['DMS_score_bin'] == 1].copy()
In [ ]:
data_fit = copy_raw[copy_raw['DMS_score_bin'] == 1][['mutant', 'mutated_sequence', 'DMS_score', 'DMS_score_bin', 'position', 'new_aa', 'old_aa']].copy()
In [ ]:
data_fit['old_aa'] = data_fit['mutant'].str[0]
In [ ]:
data_fit
Out[ ]:
mutant mutated_sequence DMS_score DMS_score_bin position new_aa old_aa
6 D2N MNLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.682954 1 2 N D
7 D2A MALSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.180782 1 2 A D
8 D2E MELSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... 0.088696 1 2 E D
10 D2Y MYLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.256095 1 2 Y D
13 L3F MDFSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.341923 1 3 NaN L
... ... ... ... ... ... ... ...
1832 I1855R MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.464328 1 1855 R I
1833 I1855M MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... 0.021941 1 1855 M I
1834 I1855L MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... 0.233297 1 1855 L I
1835 I1855K MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.863844 1 1855 K I
1836 I1855T MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.291519 1 1855 T I

1363 rows × 7 columns

In [ ]:
fit_old_glyc = data_fit[data_fit['old_aa'].isin(['S', 'T', 'N'])].copy()
In [ ]:
fit_old_glyc
Out[ ]:
mutant mutated_sequence DMS_score DMS_score_bin position new_aa old_aa
17 S4Y MDLYALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.255390 1 4 Y S
18 S4A MDLAALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... 0.073691 1 4 A S
19 S4P MDLPALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.330096 1 4 P S
20 S4F MDLFALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.495239 1 4 NaN S
21 S4C MDLCALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.245299 1 4 C S
... ... ... ... ... ... ... ...
1815 T1852N MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.074500 1 1852 N T
1816 T1852P MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.067412 1 1852 P T
1817 T1852S MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.131134 1 1852 S T
1818 T1852A MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... 0.309628 1 1852 A T
1819 T1852I MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.152108 1 1852 I T

236 rows × 7 columns

In [ ]:
# Creating the pivot table for the heatmap
heatmap_data = fit_old_glyc.pivot(index='new_aa', columns='position', values='DMS_score')

# The heatmap
plt.figure(figsize=(10, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)

# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
    for j in range(len(heatmap_data.columns)):
        new_aa = heatmap_data.index[i]
        position = heatmap_data.columns[j]
        old_aa = fit_old_glyc.loc[(fit_old_glyc['new_aa'] == new_aa) & (fit_old_glyc['position'] == position), 'old_aa']
        if not old_aa.empty:
            ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')

plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit DMS glycolysation')

plt.show()
In [ ]:
fit_new_glyc = data_fit[data_fit['new_aa'].isin(['S', 'T', 'N'])].copy()
In [ ]:
# Creating the pivot table for the heatmap
heatmap_data = fit_new_glyc.pivot(index='new_aa', columns='position', values='DMS_score')

# The heatmap
plt.figure(figsize=(40, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)

# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
    for j in range(len(heatmap_data.columns)):
        new_aa = heatmap_data.index[i]
        position = heatmap_data.columns[j]
        old_aa = fit_new_glyc.loc[(fit_new_glyc['new_aa'] == new_aa) & (fit_new_glyc['position'] == position), 'old_aa']
        if not old_aa.empty:
            ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')

plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit DMS glycolysation')

plt.show()
In [ ]:
fit_old_acet = data_fit[data_fit['old_aa'].isin(['K'])].copy()
In [ ]:
# Creating the pivot table for the heatmap
heatmap_data = fit_old_acet.pivot(index='new_aa', columns='position', values='DMS_score')

# The heatmap
plt.figure(figsize=(10, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)

# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
    for j in range(len(heatmap_data.columns)):
        new_aa = heatmap_data.index[i]
        position = heatmap_data.columns[j]
        old_aa = fit_old_acet.loc[(fit_old_acet['new_aa'] == new_aa) & (fit_old_acet['position'] == position), 'old_aa']
        if not old_aa.empty:
            ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')

plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit DMS acetylation')

plt.show()
In [ ]:
fit_new_acet = data_fit[data_fit['new_aa'].isin(['K'])].copy()
In [ ]:
# Creating the pivot table for the heatmap
heatmap_data = fit_new_acet.pivot(index='new_aa', columns='position', values='DMS_score')

# The heatmap
plt.figure(figsize=(40, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)

# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
    for j in range(len(heatmap_data.columns)):
        new_aa = heatmap_data.index[i]
        position = heatmap_data.columns[j]
        old_aa = fit_new_acet.loc[(fit_new_acet['new_aa'] == new_aa) & (fit_new_acet['position'] == position), 'old_aa']
        if not old_aa.empty:
            ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')

plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit DMS acetylation')

plt.show()
In [ ]:
fit_old_phos = data_fit[data_fit['old_aa'].isin(['S', 'T', 'Y'])].copy()
In [ ]:
# Creating the pivot table for the heatmap
heatmap_data = fit_old_phos.pivot(index='new_aa', columns='position', values='DMS_score')

# The heatmap
plt.figure(figsize=(10, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)

# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
    for j in range(len(heatmap_data.columns)):
        new_aa = heatmap_data.index[i]
        position = heatmap_data.columns[j]
        old_aa = fit_old_phos.loc[(fit_old_phos['new_aa'] == new_aa) & (fit_old_phos['position'] == position), 'old_aa']
        if not old_aa.empty:
            ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')

plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit phosphorylation')

plt.show()
In [ ]:
fit_new_phos = data_fit[data_fit['new_aa'].isin(['S', 'T', 'Y'])].copy()
In [ ]:
# Creating the pivot table for the heatmap
heatmap_data = fit_new_phos.pivot(index='new_aa', columns='position', values='DMS_score')

# The heatmap
plt.figure(figsize=(40, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)

# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
    for j in range(len(heatmap_data.columns)):
        new_aa = heatmap_data.index[i]
        position = heatmap_data.columns[j]
        old_aa = fit_new_phos.loc[(fit_new_phos['new_aa'] == new_aa) & (fit_new_phos['position'] == position), 'old_aa']
        if not old_aa.empty:
            ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')

plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit phosphorylation')

plt.show()